# Script setup: reset the workspace, attach packages, and load the inputs
# that the rest of this file transforms.

# Start from an empty global environment so objects left over from an earlier
# session cannot leak into this script.
# NOTE(review): this also wipes the workspace of anyone who source()s this
# file; running the script in a fresh R session is the safer alternative.
rm(list = ls())

# NOTE(review): lfe, fuzzyjoin and lubridate are not referenced in the visible
# part of this script; presumably they are needed by functions.R - TODO confirm.
library(tidyverse)
library(haven) # for read_sas, read_dta
library(lfe)
library(fuzzyjoin)
library(sjmisc) # for row_sums
library(lubridate)
# Bind `select` to dplyr's implementation explicitly (presumably so that a
# same-named function from another attached package cannot mask it).
select = dplyr::select

# Project helpers; nothing from this file is called by name in the code below.
source("functions.R")

# Person-by-year panel (Stata file); the main table reshaped by this script.
person_year = read_dta("data_prep/person_year_R.dta")

# load() restores objects under their saved names. The code below uses
# `xwalk_occupation_manual` and `persons`, neither of which is created in this
# script, so they are presumably provided by these two files - TODO confirm.
load("data_output/xwalk_occupation_manual.Rdata")
load(file = 'data_output/ytl_persons_brothers.Rdata')


# Add crosswalks ----------------------------------------------------------

# Add subregion of birth
# Read the county -> subregion lookup, keep only the three columns needed and
# rename them in the same step so the key columns match person_year
# (county -> skunta) and the payload gets a distinct name (subregion_skunta).
subregion_xwalk = read_dta("data_input/xwalk_county_subregion.dta") %>% 
  select(vuosi, skunta = county, subregion_skunta = subregion)

# skunta is converted to numeric before joining so its type is comparable
# with the crosswalk key.
# The join keys are spelled out instead of relying on a natural join: this
# pins the keys to the ones the join was documented to use (vuosi, skunta), so
# a column that later happens to share a name on both sides cannot silently
# become an extra key.
person_year = person_year %>% 
  mutate(skunta = as.numeric(skunta)) %>% 
  left_join(subregion_xwalk, by = c("vuosi", "skunta"))

# Add cpi and updated occupation xwalk

# Consumer price index by year (vuosi), rescaled so that 2015 = 1.
cpi = read_delim("data_input/cpi.csv", delim = ",")

# The rescaling divides by the single 2015 value; fail with a clear message if
# the base year is absent or duplicated instead of an opaque size error
# inside mutate().
if (sum(cpi$vuosi == 2015, na.rm = TRUE) != 1) {
  stop("data_input/cpi.csv must contain exactly one row with vuosi == 2015",
       call. = FALSE)
}

cpi = cpi %>% 
  mutate(cpi = cpi / cpi[vuosi == 2015])

# Attach the manual occupation crosswalk and the price index, then deflate the
# two earnings components by the index. Join keys are given explicitly (they
# are the keys the original natural joins were documented to use) so that a
# column added to either side later cannot silently change the match.
person_year = person_year %>% 
  left_join(xwalk_occupation_manual, by = c("vuosi", "ammattikoodi_k")) %>% 
  left_join(cpi, by = "vuosi") %>% 
  mutate(tyotu_k = tyotu_k / cpi,
         tyrtuo_k = tyrtuo_k / cpi)


# Tidy --------------------------------------------------------------------


# Add cumulative activity variables
# Keep person-years with ika above 17 and, within each person (shnro) in
# ascending ika order, build running counts of the years spent in each
# ptoim1 state (11 -> employed, 22 -> study, 12 -> unemp, 24/25/99 -> other)
# plus a running count of observed person-years (ptoim_count).
#
# The indicators use %in% rather than ==: with ==, a missing ptoim1 yields NA
# and cumsum() would then return NA for that year and every later year of the
# same person. %in% treats a missing state as "not in this state", which is
# also how ptoim_other was already computed.
person_year = person_year %>% 
  filter(ika > 17) %>% 
  group_by(shnro) %>% 
  # arrange() ignores the grouping and sorts all rows by ika; rows within each
  # shnro therefore end up in ika order, which is what cumsum() needs.
  arrange(ika) %>% 
  mutate(employed = cumsum(ptoim1 %in% 11),
         study = cumsum(ptoim1 %in% 22),
         unemp = cumsum(ptoim1 %in% 12),
         ptoim_other = cumsum(ptoim1 %in% c(24, 25, 99)),
         ptoim_count = row_number()) %>% 
  ungroup()

# Build the earnings table: person-years from 1998-2018 with total labor
# income and analysis-friendly copies of several raw columns.
earnings = person_year %>%
  # Restrict to the sample years first; the row-wise sum below does not depend
  # on other rows, so the result is the same as summing before filtering.
  filter(vuosi %in% 1998:2018) %>%
  # inc_labor = tyotu_k + tyrtuo_k, appended as a new column.
  # NOTE(review): n = 1 presumably means at least one of the two components
  # must be non-missing for a sum to be returned - confirm against sjmisc docs.
  row_sums(tyotu_k, tyrtuo_k, n = 1, var = "inc_labor") %>%
  # These are copies under new names; the source columns are kept as well.
  mutate(
    emp_11 = ptoim1 %in% 11,
    occ_new = occ_manual_new,
    field = ututku_ala,
    educ_years = ututku_aste,
    ptoim = ptoim1,
    emp_type = oyr_omist_tyyppi, # shorter alias for an unwieldy source name
    firm = plant_id
  ) %>%
  ungroup()

# Attach the yearly earnings rows to the person-level table. Every row of
# `persons` is kept; persons with several earnings years appear once per year.
dta = left_join(persons, earnings, by = "shnro")

# Add variables  ---------------------------------------------------------

# Personality data is available for birth cohorts from 1962 to 1979
# IQ data is available from 1962 to 1991

# Derive education, occupation and sector variables on the person-year table.
dta = dta %>%
  # educ = "<educ_years>_<field>", job = "<educ>_<occ_new>"; the inputs are
  # kept (remove = FALSE). FALSE is spelled out because `F` is an ordinary
  # variable that sourced code could reassign.
  unite(educ, educ_years, field, remove = FALSE) %>% 
  unite(job, educ, occ_new, remove = FALSE) %>% 
  mutate(college = educ_years %in% c("5", "6", "7", "8"),
         # public employment; missing emp_type becomes an explicit factor level
         # NOTE(review): fct_explicit_na() is deprecated since forcats 1.0.0 in
         # favour of fct_na_value_to_level(); kept here so the script does not
         # require a newer forcats - switch once the version is confirmed.
         pub = fct_explicit_na(as.factor(emp_type == 2 | emp_type == 3)),
         # First character of the occupation code. str_sub() returns a string,
         # so the comparisons below are written against strings explicitly
         # (the original compared against numbers and relied on coercion).
         occ1 = str_sub(occ_new, 1, 1),
         occ1_clerical = occ1 == "3" | occ1 == "4",
         occ1_blue = occ1 == "7" | occ1 == "8",
         occ1_other = occ1 == "0" | occ1 == "6" | occ1 == "9",
         # Map the education level code to years of schooling; any other
         # non-missing code gets 9.
         # NOTE(review): recode() leaves missing inputs as NA (.default does
         # not apply to them). If a missing level should also count as 9
         # years, add `.missing = 9L` - TODO confirm intent.
         educ_y = recode(educ_years, 
                         "3" = 12L,
                         "4" = 12L,
                         "5" = 14L,
                         "6" = 15L,
                         "7" = 17L,
                         "8" = 22L,
                         .default = 9L),
         # Collapse the two-digit field code into four coarse groups.
         field_coarce = recode(field,
                               "01" = "hass",
                               "02" = "hass",
                               "03" = "hass",
                               "04" = "buslaw",
                               "05" = "stem",
                               "06" = "stem",
                               "07" = "stem",
                               "08" = "other",
                               "09" = "other",
                               "10" = "other",
                               "99" = "other",
                               "00" = "other",
                               .default = "other"))

# Save
save(dta, file = 'data_output/persons_years.Rdata')
